import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session so cell output stays
# uncluttered (this hides library deprecation notices and similar warnings too).
warnings.simplefilter("ignore")

# Machine-specific absolute location of the source CSV; the raw string keeps
# the Windows backslashes literal.
DATA_PATH = r"E:\Ankit Jain\D drive\Aviraj Personal File\IMS Analytics Class\Github sets\Diabetes Analysis\diabetes.csv"
diabetes = pd.read_csv(DATA_PATH)

# Preview the first five rows.
diabetes.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Shorten the long pedigree column name so it is easier to reference.
diabetes = diabetes.rename(columns={"DiabetesPedigreeFunction": "Pedigree"})

# Count missing values per column.
diabetes.isna().sum()
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 Pedigree 0 Age 0 Outcome 0 dtype: int64
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
# NOTE(review): several measurement columns report a minimum of 0 (e.g. Glucose,
# BloodPressure, BMI); presumably these zeros stand in for missing readings — confirm.
diabetes.describe()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | Pedigree | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
# Print column dtypes, non-null counts and the frame's memory usage.
diabetes.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 Pedigree 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
# Show the column labels of the frame.
diabetes.columns
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'Pedigree', 'Age', 'Outcome'],
dtype='object')
# Print the frequency table of every column, with a blank line after each one.
# Iterating a DataFrame yields its column labels.
for column in diabetes:
    print(diabetes[column].value_counts())
    print()
1 135
0 111
2 103
3 75
4 68
5 57
6 50
7 45
8 38
9 28
10 24
11 11
13 10
12 9
14 2
15 1
17 1
Name: Pregnancies, dtype: int64
100 17
99 17
129 14
125 14
111 14
..
177 1
172 1
169 1
160 1
199 1
Name: Glucose, Length: 136, dtype: int64
70 57
74 52
68 45
78 45
72 44
64 43
80 40
76 39
60 37
0 35
62 34
66 30
82 30
88 25
84 23
90 22
86 21
58 21
50 13
56 12
52 11
54 11
92 8
75 8
65 7
94 6
85 6
48 5
44 4
96 4
110 3
100 3
98 3
106 3
108 2
104 2
30 2
55 2
46 2
40 1
38 1
24 1
95 1
61 1
102 1
114 1
122 1
Name: BloodPressure, dtype: int64
0 227
32 31
30 27
27 23
23 22
33 20
18 20
28 20
31 19
39 18
19 18
29 17
37 16
26 16
22 16
40 16
25 16
35 15
41 15
36 14
15 14
17 14
20 13
24 12
42 11
13 11
21 10
34 8
46 8
38 7
12 7
14 6
16 6
11 6
43 6
45 6
10 5
44 5
48 4
47 4
50 3
49 3
54 2
52 2
7 2
8 2
60 1
56 1
63 1
51 1
99 1
Name: SkinThickness, dtype: int64
0 374
105 11
140 9
130 9
120 8
...
271 1
270 1
108 1
112 1
846 1
Name: Insulin, Length: 186, dtype: int64
32.0 13
31.6 12
31.2 12
0.0 11
33.3 10
..
32.1 1
52.9 1
31.3 1
45.7 1
42.8 1
Name: BMI, Length: 248, dtype: int64
0.254 6
0.258 6
0.259 5
0.238 5
0.207 5
..
0.886 1
0.804 1
1.251 1
0.382 1
0.375 1
Name: Pedigree, Length: 517, dtype: int64
22 72
21 63
25 48
24 46
23 38
28 35
26 33
27 32
29 29
31 24
41 22
30 21
37 19
42 18
33 17
32 16
36 16
38 16
45 15
34 14
40 13
43 13
46 13
39 12
35 10
50 8
44 8
51 8
52 8
58 7
47 6
54 6
57 5
60 5
48 5
49 5
53 5
55 4
62 4
63 4
66 4
56 3
59 3
65 3
67 3
61 2
69 2
72 1
64 1
68 1
70 1
81 1
Name: Age, dtype: int64
0 500
1 268
Name: Outcome, dtype: int64
# Grid of pairwise plots across all columns of the frame.
sns.pairplot(diabetes)
<seaborn.axisgrid.PairGrid at 0x1ce82f93550>
# One box per column on a wide canvas, to compare spreads and spot outliers.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=diabetes, ax=ax)
<AxesSubplot:>
# Pairwise correlation matrix of all columns (pandas' default method is Pearson).
diabetes.corr()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | Pedigree | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| Pregnancies | 1.000000 | 0.129459 | 0.141282 | -0.081672 | -0.073535 | 0.017683 | -0.033523 | 0.544341 | 0.221898 |
| Glucose | 0.129459 | 1.000000 | 0.152590 | 0.057328 | 0.331357 | 0.221071 | 0.137337 | 0.263514 | 0.466581 |
| BloodPressure | 0.141282 | 0.152590 | 1.000000 | 0.207371 | 0.088933 | 0.281805 | 0.041265 | 0.239528 | 0.065068 |
| SkinThickness | -0.081672 | 0.057328 | 0.207371 | 1.000000 | 0.436783 | 0.392573 | 0.183928 | -0.113970 | 0.074752 |
| Insulin | -0.073535 | 0.331357 | 0.088933 | 0.436783 | 1.000000 | 0.197859 | 0.185071 | -0.042163 | 0.130548 |
| BMI | 0.017683 | 0.221071 | 0.281805 | 0.392573 | 0.197859 | 1.000000 | 0.140647 | 0.036242 | 0.292695 |
| Pedigree | -0.033523 | 0.137337 | 0.041265 | 0.183928 | 0.185071 | 0.140647 | 1.000000 | 0.033561 | 0.173844 |
| Age | 0.544341 | 0.263514 | 0.239528 | -0.113970 | -0.042163 | 0.036242 | 0.033561 | 1.000000 | 0.238356 |
| Outcome | 0.221898 | 0.466581 | 0.065068 | 0.074752 | 0.130548 | 0.292695 | 0.173844 | 0.238356 | 1.000000 |
# Annotated heatmap of the pairwise correlation matrix on a wide, frameless figure.
plt.figure(frameon=False, figsize=(15, 5))
corr_matrix = diabetes.corr()
sns.heatmap(corr_matrix, annot=True)
<AxesSubplot:>
import plotly.express as px
# Interactive scatter plots for two feature pairs.
# NOTE(review): if both calls live in the same notebook cell, only the last
# expression is rendered, so the first figure would be lost — confirm the cell layout.
px.scatter(diabetes,x="Insulin",y="BMI")
px.scatter(diabetes,x="SkinThickness",y="Pedigree")
# Re-display the first three rows.
diabetes.head(3)
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | Pedigree | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
# Count rows for each (Outcome, Pregnancies) combination; margins=True adds the
# "All" row and column holding the totals.
pd.crosstab(diabetes["Outcome"],diabetes["Pregnancies"],margins=True)
| Pregnancies | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 17 | All |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Outcome | ||||||||||||||||||
| 0 | 73 | 106 | 84 | 48 | 45 | 36 | 34 | 20 | 16 | 10 | 14 | 4 | 5 | 5 | 0 | 0 | 0 | 500 |
| 1 | 38 | 29 | 19 | 27 | 23 | 21 | 16 | 25 | 22 | 18 | 10 | 7 | 4 | 5 | 2 | 1 | 1 | 268 |
| All | 111 | 135 | 103 | 75 | 68 | 57 | 50 | 45 | 38 | 28 | 24 | 11 | 9 | 10 | 2 | 1 | 1 | 768 |
# Smallest and largest age in the data, shown as a (min, max) tuple.
diabetes["Age"].min(),diabetes["Age"].max()
(21, 81)
# Bucket ages into right-closed decade bins: (20, 30], (30, 40], ..., (80, 90].
age = pd.cut(diabetes["Age"], [20, 30, 40, 50, 60, 70, 80, 90])

# Sum each measurement per (Outcome, age bin), with an "All" totals row.
# The string "sum" selects pandas' own aggregation; passing the builtin `sum`
# is deprecated in recent pandas and would differ from the other pivot in
# this file, which already uses the string form.
diabetes.pivot_table(
    values=["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin"],
    index=["Outcome", age],
    aggfunc="sum",
    margins=True,
    observed=True,
)
| BloodPressure | Glucose | Insulin | Pregnancies | SkinThickness | ||
|---|---|---|---|---|---|---|
| Outcome | Age | |||||
| 0 | (20, 30] | 21486 | 34982 | 24900 | 636 | 6819 |
| (30, 40] | 5746 | 9246 | 5875 | 428 | 1628 | |
| (40, 50] | 3606 | 5292 | 2165 | 320 | 953 | |
| (50, 60] | 1816 | 2865 | 1026 | 156 | 218 | |
| (60, 70] | 1364 | 2352 | 370 | 98 | 181 | |
| (70, 80] | 0 | 119 | 0 | 2 | 0 | |
| (80, 90] | 74 | 134 | 60 | 9 | 33 | |
| 1 | (20, 30] | 5754 | 12629 | 10256 | 201 | 2270 |
| (30, 40] | 5287 | 10564 | 6168 | 400 | 1597 | |
| (40, 50] | 4845 | 8820 | 4646 | 485 | 1238 | |
| (50, 60] | 2507 | 4757 | 5820 | 196 | 718 | |
| (60, 70] | 588 | 1087 | 0 | 22 | 117 | |
| All | 53073 | 92847 | 61286 | 2953 | 15772 |
# Line plot of the summed measurements for every (Outcome, age bin) group.
summed_by_group = diabetes.pivot_table(
    index=["Outcome", age],
    values=["Glucose", "BloodPressure", "SkinThickness", "Insulin"],
    aggfunc="sum",
)
summed_by_group.plot()
<AxesSubplot:xlabel='Outcome,Age'>
# Features are every column except the target. Selecting by name instead of by
# position keeps this correct even if the column order of the CSV changes.
x = diabetes.drop(columns="Outcome")
# Target: the binary Outcome column.
y = diabetes["Outcome"]
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.25, random_state=3
)

# Fit a logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the features are passed unscaled and warnings are silenced at
# the top of the file, so a solver convergence warning would go unnoticed —
# worth confirming the fit converges.
logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)
LogisticRegression()
# Predict a class label for every held-out row, then display the resulting array.
ypred=logreg.predict(xtest)
ypred
array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)
# Side-by-side view of the first 20 true labels and their predictions; the row
# labels come from the test Series' original index.
pd.DataFrame(
    {
        "Actual": ytest.iloc[:20],
        "Predicted": ypred[:20],
    }
)
| Actual | Predicted | |
|---|---|---|
| 51 | 0 | 0 |
| 378 | 1 | 1 |
| 27 | 0 | 0 |
| 579 | 1 | 1 |
| 86 | 0 | 0 |
| 144 | 0 | 0 |
| 65 | 0 | 0 |
| 617 | 0 | 0 |
| 10 | 0 | 0 |
| 522 | 0 | 0 |
| 535 | 1 | 0 |
| 123 | 0 | 0 |
| 389 | 0 | 0 |
| 399 | 1 | 1 |
| 270 | 1 | 1 |
| 81 | 0 | 0 |
| 80 | 0 | 0 |
| 450 | 0 | 0 |
| 693 | 1 | 1 |
| 648 | 1 | 0 |
# Confusion matrix on the test set: rows are the actual classes, columns the
# predicted classes.
metrics.confusion_matrix(ytest,ypred)
array([[96, 16],
[36, 44]], dtype=int64)
# Accuracy in percent, derived from the confusion matrix: correct predictions
# (the diagonal) over all predictions. Computing it here, rather than
# hand-copying counts from printed output, keeps the figure correct when the
# data, split or model changes.
cm = metrics.confusion_matrix(ytest, ypred)
int(round(cm.trace() / cm.sum() * 100))
73
# Per-class precision, recall and F1, plus overall accuracy, on the test set.
print(metrics.classification_report(ytest,ypred))
precision recall f1-score support
0 0.73 0.86 0.79 112
1 0.73 0.55 0.63 80
accuracy 0.73 192
macro avg 0.73 0.70 0.71 192
weighted avg 0.73 0.73 0.72 192
import statsmodels.api as sma
# statsmodels' Logit does not add an intercept on its own, so add the constant
# column explicitly. Without it the regression is forced through the origin,
# which distorts the coefficients and the fit statistics.
x_const = sma.add_constant(x)

# Fit on the full dataset (not only the training split) and print the
# coefficient table with standard errors, z-scores and p-values.
logmodel = sma.Logit(y, x_const)
result = logmodel.fit()
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.608498
Iterations 5
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.059
Dependent Variable: Outcome AIC: 950.6528
Date: 2021-06-19 15:00 BIC: 987.8031
No. Observations: 768 Log-Likelihood: -467.33
Df Model: 7 LL-Null: -496.74
Df Residuals: 760 LLR p-value: 2.5825e-10
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
-----------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-----------------------------------------------------------------
Pregnancies 0.1284 0.0286 4.4843 0.0000 0.0723 0.1845
Glucose 0.0129 0.0027 4.7568 0.0000 0.0076 0.0183
BloodPressure -0.0303 0.0047 -6.4806 0.0000 -0.0395 -0.0212
SkinThickness 0.0002 0.0061 0.0323 0.9742 -0.0117 0.0121
Insulin 0.0007 0.0008 0.9420 0.3462 -0.0008 0.0023
BMI -0.0048 0.0107 -0.4494 0.6531 -0.0258 0.0162
Pedigree 0.3203 0.2399 1.3351 0.1818 -0.1499 0.7905
Age -0.0156 0.0084 -1.8517 0.0641 -0.0322 0.0009
=================================================================
# Report accuracy, precision and recall on the test set as whole percentages,
# one value per line, in that order.
for score in (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
):
    print(round(score(ytest, ypred) * 100))
73 73 55